# -*- coding: utf-8 -*-
"""
Code for whether a section is enacted in the same congressional term
First created: 4/21/23
Final edit: 8/22/23

"""

import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import time
import os
import csv

# Congresses to process: the 103rd through the 114th.
cong = list(range(103, 115))

for i in cong:
   num = str(i)

   # Load this Congress's cleaned sections and drop the index column that was
   # saved into the CSV.
   os.chdir('/replication/cleaned_sections/' + num + '/')
   secs_all = pd.read_csv(num + "_sections_noboilerplate.csv")
   secs_all = secs_all.drop(columns=['Unnamed: 0'])

   # Congress number = first three characters of the bill id, stored as int.
   secs_all['cong'] = secs_all['bill'].str[:3].astype(int)

   # Split into introduced ("ih"/"is") and enrolled ("enr") sections.
   # str.contains treats the pattern as a regular expression and matches it
   # anywhere in the bill id.
   # NOTE(review): a substring match would also select ids that merely contain
   # these letters (e.g. a version code such as "rih" or "renr") - confirm the
   # bill id format rules this out.
   sec_intro = secs_all.loc[secs_all['bill'].str.contains("ih|is")]
   sec_enr = secs_all.loc[secs_all['bill'].str.contains("enr")]
   secs_all = []  # release the reference to the full table

   # Plain lists of row values (one inner list per section) for the pairwise loop.
   secs_intro_list = sec_intro.values.tolist()
   secs_enr_list = sec_enr.values.tolist()
   counter1, counter2 = len(secs_intro_list), len(secs_enr_list)

   # Results are written relative to this directory.
   os.chdir('/replication/enacted_sections/')

   # Start a fresh output file for this Congress containing only the header row;
   # matched pairs are appended to it later.
   header = [
       'bill1', 'sec1', 'raw_txt1', 'clean_txt1', 'date1',
       'bill2', 'sec2', 'raw_txt2', 'clean_txt2', 'date2',
       'txt1_ad_5', 'txt2_ad_5', 'scope_10',
       'num_blocks', 'perblock_txt1', 'perblock_txt2',
       "first_shared_keys_2", "first_txt1_ad_2", "first_txt2_ad_2",
       'match', 'x', 'y',
   ]
   with open('secs' + num + '_enacted.csv', 'w', encoding='UTF8', newline='') as f:
       writer = csv.writer(f)
       writer.writerow(header)

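# Function that uses the decision tree to match sections.
# Run decisiontree_functions.py and decision_tree.py beforehand: they presumably
# define dice, gen_hash, gen_hash_first100, blocks and clf_model_minor, which the
# function below uses but this script does not define or import.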
   def pairwise_dice(num1, txts1, num2, txts2, *, min_dice=0.35, max_text_len=30000):
       """Match introduced sections to enrolled sections and log the matches.

       Every one of the first ``num1`` rows of ``txts1`` is compared with every
       one of the first ``num2`` rows of ``txts2``. A pair is scored only when
       the first row's date is strictly earlier than the second row's date,
       both rows have clean text, and their Dice score exceeds ``min_dice``.
       Scored pairs are classified with ``clf_model_minor``; pairs predicted
       as 1 are appended as one row to ``'secs' + num + '_enacted.csv'`` in the
       current working directory.

       Rows are indexed positionally: [0] bill, [1] section, [2] raw text,
       [3] clean text, [5] date (names taken from the output header).

       Relies on names supplied by the surrounding script / session and not
       defined here: ``num``, ``dice``, ``gen_hash``, ``gen_hash_first100``,
       ``blocks`` and ``clf_model_minor``.

       Args:
           num1: Number of rows of ``txts1`` to scan.
           txts1: Introduced-section rows (list of lists).
           num2: Number of rows of ``txts2`` to scan.
           txts2: Enrolled-section rows (list of lists).
           min_dice: Pairs with a Dice score at or below this value are skipped.
           max_text_len: Text fields this long or longer are written as ''.

       Returns:
           None. Results are written to the CSV file and progress is printed.
       """

       def _fit_for_csv(text):
           # Blank out text that is missing (a float, as in the clean-text
           # guard below) or too long for a CSV cell. The original called
           # len() unguarded, which raises TypeError on a float raw-text cell.
           if isinstance(text, float):
               return ''
           return text if len(text) < max_text_len else ''

       out_path = 'secs' + num + '_enacted.csv'

       for x in range(num1):
           intro = txts1[x]
           # A float in the clean-text column means there is no text to compare
           # (presumably NaN from an empty CSV cell - confirm).
           if isinstance(intro[3], float):
               continue
           print(num, x)  # progress: one line per introduced section scanned

           for y in range(num2):
               enr = txts2[y]
               # Only look at introduced sections dated before the enacted bill.
               if intro[5] >= enr[5]:
                   continue
               if isinstance(enr[3], float):
                   continue
               # Cheap pre-filter: skip pairs whose Dice score is too low.
               if dice(intro[3], enr[3]) <= min_dice:
                   continue

               score5 = gen_hash(intro[3], enr[3], 5, intro[0], enr[0])
               score10 = gen_hash(intro[3], enr[3], 10, intro[0], enr[0])
               block1 = blocks(intro[3], enr[3])
               score_first2 = gen_hash_first100(intro[3], enr[3], 5, intro[0], enr[0])

               # Nine features, in the order of the header columns
               # 'txt1_ad_5' ... 'first_txt2_ad_2'; also the model's input order.
               features = [
                   score5[3], score5[4], score10[5],
                   block1[1], block1[4], block1[5],
                   score_first2[2], score_first2[3], score_first2[4],
               ]
               label = clf_model_minor.predict(np.asarray([features]))[0]
               if label != 1:
                   continue  # decision tree says the sections do not match

               # Build the output row only for matches; non-matches are never written.
               row = [
                   intro[0],                 # bill1
                   intro[1],                 # sec1
                   _fit_for_csv(intro[2]),   # raw_txt1
                   _fit_for_csv(intro[3]),   # clean_txt1
                   intro[5],                 # date1
                   enr[0],                   # bill2
                   enr[1],                   # sec2
                   _fit_for_csv(enr[2]),     # raw_txt2
                   _fit_for_csv(enr[3]),     # clean_txt2
                   enr[5],                   # date2
                   *features,
                   label,                    # match
                   x,
                   y,
               ]
               print(num, x, y)
               # Append immediately so that matches already found are on disk
               # even if a long run is interrupted.
               with open(out_path, 'a', encoding='UTF8', newline='') as f:
                   csv.writer(f).writerow(row)
                     
   # Run the matcher for this Congress and print the CPU time (seconds, as
   # reported by time.process_time) that the run consumed.
   start = time.process_time()
   comparisons = pairwise_dice(counter1, secs_intro_list, counter2, secs_enr_list)
   elapsed = time.process_time() - start
   print(elapsed)
